# Install required packages if not already installed
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install(c("SNPRelate", "GENESIS", "qqman", "GeneNet", "xlsx", "GWASTools", "SeqVarTools", "SeqArray", "RCy3"))

if(!require("SNPRelate", quietly = TRUE))
    install.packages("SNPRelate")

if(!require("GENESIS", quietly = TRUE))
    install.packages("GENESIS")

if(!require("dplyr", quietly = TRUE))
    install.packages("dplyr")

if(!require("openxlsx", quietly = TRUE))
    install.packages("openxlsx")

if(!require("readxl", quietly = TRUE))
    install.packages("readxl")

if(!require("ggplot2", quietly = TRUE))
    install.packages("ggplot2")

if(!require("qqman", quietly = TRUE))
    install.packages("qqman")

Task 1 : Compute Kinship using SNPRelate and GENESIS

# Load required libraries
library(SNPRelate)
library(GENESIS)
library(GWASTools)
library(SeqVarTools)
library(SeqArray)
library(dplyr)
library(openxlsx)
library(readxl)
library(ggplot2)
library(qqman)
# Convert PLINK format to GDS format
snpgdsPED2GDS("Dataset/input.ped", "Dataset/input.map", "Dataset/genotype.gds")

# Open the GDS file
genofile <- snpgdsOpen("Dataset/genotype.gds")

# Calculate kinship using MLE method
ibd <- snpgdsIBDMLE(genofile, 
                        num.thread = 6,  
                        kinship.constraint = TRUE,
                        kinship = TRUE)  

kinship <- ibd$kinship
rownames(kinship) <- colnames(kinship) <- ibd$sample.id

# Close the GDS file
snpgdsClose(genofile)

# Count individuals with kinship > 0.1
high_kinship <- (sum(kinship > 0.1, na.rm = TRUE) - 156) / 2.0  
cat("Number of individuals with kinship > 0.1:", high_kinship, "\n")
write.xlsx(kinship, file = "Dataset/Kinship_Matrix.xlsx", rowNames = TRUE)
kinship <- as.matrix(read.xlsx("Dataset/Kinship_Matrix.xlsx", rowNames = TRUE))

Task 2: Compute mQTLs with Mixed Models

# Load PCA data
pca_data <- read.table("Dataset/Dataset of 156 Qataris/pca_results.eigenvec", header = FALSE)
colnames(pca_data) <- c("FID", "IID", paste0("PC", 1:20))
rownames(pca_data) <- pca_data$IID
pcs <- pca_data[, c("PC1", "PC2", "PC3")]

# Load metabolite data
metabolites <- read.csv("Dataset/qatari_metabolites_2025.csv", row.names = 1)

# Merge PCA with metabolite data
all.data <- as.data.frame(cbind(pcs, metabolites))
meta.names <- colnames(metabolites)

# Sample IDs
scan_annot_df <- data.frame(
  scanID = rownames(metabolites),
  stringsAsFactors = FALSE
)
rownames(scan_annot_df) <- scan_annot_df$scanID
scan_annot <- ScanAnnotationDataFrame(scan_annot_df)
genoFile <- "Dataset/genotype.gds"
gds <- GdsGenotypeReader(genoFile)
genoData <- GenotypeData(gds, scanAnnot = scan_annot)
results <- list()
null_models <- list()

for (meta in meta.names) {
  df <- all.data[, c("PC1", "PC2", "PC3", meta)]
  colnames(df)[4] <- "trait"
  
  null.model <- fitNullModel(
    x = df,
    outcome = "trait",
    covars = c("PC1", "PC2", "PC3"),
    cov.mat = kinship,
    family = "gaussian",
    verbose = TRUE
  )
  
  genoIterator <- GenotypeBlockIterator(genoData)
  assoc <- assocTestSingle(
    gdsobj = genoIterator,
    null.model = null.model,
    verbose = TRUE
  )
  
  null_models[[meta]] <- null.model
  assoc$metabolite <- meta
  results[[meta]] <- assoc
}
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  140.150908  140.150908 -646.874628    1.350861
## [1]   21.354672  210.457918 -646.557174    1.269528
## [1]    7.279477  218.697709 -646.527236    1.261173
## [1]    0.328382  222.764840 -646.512966    1.257109
## [1]    0.1125786  222.8910805 -646.5125283    1.2569830
## [1]    0.000000  222.954187 -646.512300    1.256933
## [1]    0.000000  268.528849 -646.512300    1.043607
## [1]    0.000000  279.749204 -646.512300    1.001749
## [1]    0.000000  280.237632 -646.512300    1.000003
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  142.31419  142.31419 -648.93409    1.36687
## [1]  417.72541   64.47409 -648.82217    1.07896
## [1]  504.339488   32.141102 -648.808332    1.040985
## [1]  553.922749   13.520060 -648.804425    1.020921
## [1]  580.055431    3.735468 -648.803434    1.010575
## [1]  586.633755    1.291592 -648.803295    1.007939
## [1]  589.0900642    0.3820982 -648.8032535    1.0069476
## [1]  589.6259740    0.1839682 -648.8032453    1.0067305
## [1]  589.88538667    0.08809481 -648.80324142    1.00662534
## [1]  590.01302450    0.04093073 -648.80323954    1.00657358
## [1]  590.07633    0.00000 -648.80324    1.00661
## [1]  593.950907    0.000000 -648.803238    1.000043
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  138.26954  138.26954 -646.07942    1.35499
## [1]  411.716609   30.907443 -645.748661    1.200057
## [1]  475.975517    4.360440 -645.692907    1.175653
## [1]  483.557490    1.214955 -645.686793    1.172916
## [1]  485.4382023    0.4344023 -645.6852918    1.1722413
## [1]  486.37670238    0.04485858 -645.68454466    1.17190501
## [1]  486.435301    0.000000 -645.684459    1.171985
## [1]  557.818278    0.000000 -645.684459    1.022009
## [1]  569.830737    0.000000 -645.684459    1.000464
## [1]  570.0950    0.0000 -645.6845    1.0000
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  160.421757  160.421757 -655.088129    1.314857
## [1]  389.875348   61.635530 -654.705739    1.241815
## [1]  499.680186   13.908812 -654.555711    1.212554
## [1]  525.881455    2.495458 -654.522693    1.205961
## [1]  529.112725    1.087391 -654.518692    1.205158
## [1]  530.7255707    0.3845445 -654.5167009    1.2047572
## [1]  531.53129438    0.03341862 -654.51570773    1.20455725
## [1]  531.581630    0.000000 -654.515613    1.204598
## [1]  621.869435    0.000000 -654.515613    1.029705
## [1]  639.809230    0.000000 -654.515613    1.000833
## [1]  640.341690    0.000000 -654.515613    1.000001
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  172.144976  172.144976 -659.273384    1.294684
## [1]   35.702989  267.246050 -659.178850    1.163019
## [1]    6.584969  286.380269 -659.166335    1.143013
## [1]    3.211298  288.586812 -659.165016    1.140783
## [1]    1.540975  289.678800 -659.164372    1.139685
## [1]    0.7099403  290.2219800 -659.1640543    1.1391392
## [1]    0.2954546  290.4928674 -659.1638964    1.1388675
## [1]    0.08846968  290.62813550 -659.16381766    1.13873190
## [1]    0.03675566  290.66193060 -659.16379801    1.13869803
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  121.245311  121.245311 -634.579372    1.328256
## [1]  233.338879  111.172919 -634.564210    1.065211
## [1]  319.532549   82.985884 -634.557998    1.003869
## [1]  349.929588   69.133397 -634.557133    1.000032
## [1]  358.335712   65.058841 -634.557062    1.000001
## [1]  360.70381   63.90936 -634.55706    1.00000
## [1]  361.37285   63.58455 -634.55706    1.00000
## [1]  361.56199   63.49272 -634.55706    1.00000
## [1]  361.61548   63.46675 -634.55706    1.00000
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  156.52596  156.52596 -655.64424    1.35748
## [1]   48.23097  216.13551 -655.41039    1.31459
## [1]   19.787112  231.612743 -655.354057    1.304855
## [1]    5.438761  239.401757 -655.326345    1.300114
## [1]    1.837479  241.354535 -655.319461    1.298943
## [1]    0.03515157  242.33157265 -655.31602637    1.29835984
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  123.996592  123.996592 -639.338760    1.382719
## [1]   60.871044  157.068136 -639.003531    1.359475
## [1]   28.060549  174.188048 -638.839321    1.348401
## [1]   11.386016  182.873897 -638.758257    1.342995
## [1]    2.987092  187.245608 -638.718008    1.340325
## [1]    0.8800161  188.3419618 -638.7079711    1.3396611
## [1]    0.3527979  188.6162597 -638.7054634    1.3394953
## [1]    0.08913293  188.75343478 -638.70420979    1.33941246
## [1]    0.00000  188.78773 -638.70379    1.33947
## [1]    0.000000  236.633363 -638.703786    1.068639
## [1]    0.000000  251.832293 -638.703786    1.004143
## [1]    0.000000  252.871221 -638.703786    1.000017
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  161.385244  161.385244 -656.142070    1.325259
## [1]   38.484860  227.189174 -655.739495    1.286463
## [1]    7.468218  243.720938 -655.648218    1.277610
## [1]    3.598531  245.781954 -655.637095    1.276527
## [1]    1.664369  246.812015 -655.631558    1.275988
## [1]    0.6974669  247.3269292 -655.6287948    1.2757181
## [1]    0.2140616  247.5843569 -655.6274148    1.2755835
## [1]    0.09321604  247.64871016 -655.62706997    1.27554983
## [1]    0.032794  247.680886 -655.626898    1.275533
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  119.222004  119.222004 -634.668187    1.352377
## [1]   72.31691  188.16110 -634.65008    1.07306
## [1]   34.572058  221.749578 -634.647680    1.004712
## [1]   33.875045  223.208366 -634.647680    1.000022
## [1]   33.97702  223.16392 -634.64768    1.00000
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  126.182791  126.182791 -639.710426    1.365424
## [1]   16.109325  191.722220 -639.503496    1.280448
## [1]    2.155260  199.834267 -639.481554    1.271753
## [1]    0.4183262  200.8420620 -639.4788802    1.2706921
## [1]    0.2013401  200.9679317 -639.4785470    1.2705599
## [1]    0.09285534  201.03085991 -639.47838052    1.27049387
## [1]    0.03861503  201.06232238 -639.47829728    1.27046085
## [1]    0.000000  201.078053 -639.478238    1.270479
## [1]    0.000000  243.886684 -639.478238    1.047476
## [1]    0.000000  254.940734 -639.478238    1.002059
## [1]    0.000000  255.464462 -639.478238    1.000004
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  150.583257  150.583257 -652.854433    1.360195
## [1]   46.569163  230.637049 -652.796187    1.201497
## [1]   18.016408  249.827625 -652.784980    1.176845
## [1]    4.077068  259.036820 -652.779997    1.165899
## [1]    0.6481105  261.2855537 -652.7788176    1.1633132
## [1]    0.2213884  261.5649227 -652.7786720    1.1629943
## [1]    0.000000  261.704499 -652.778597    1.162853
## [1]    0.000000  298.355137 -652.778597    1.020005
## [1]    0.000000  304.206731 -652.778597    1.000385
## [1]    0.0000  304.3237 -652.7786    1.0000
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  151.852679  151.852679 -651.778154    1.329857
## [1]   12.783350  275.359797 -651.735319    1.065982
## [1]    0.5352815  282.4135798 -651.7329250    1.0618756
## [1]    0.1585763  282.6292272 -651.7328528    1.0617547
## [1]    0.06444825  282.68310193 -651.73283481    1.06172456
## [1]    0.000000  282.710035 -651.732822    1.061741
## [1]    0.000000  299.149902 -651.732822    1.003393
## [1]    0.000000  300.161489 -651.732822    1.000011
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  126.628339  126.628339 -638.435559    1.337986
## [1]   50.251834  169.409202 -638.303894    1.297236
## [1]    8.702666  192.262902 -638.237318    1.278975
## [1]    3.321966  195.200179 -638.228912    1.276800
## [1]    0.620575  196.673553 -638.224709    1.275720
## [1]    0.2822175  196.8580172 -638.2241838    1.2755852
## [1]    0.1129962  196.9502678 -638.2239210    1.2755179
## [1]    0.02837488  196.99639756 -638.22378959    1.27548428
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  122.896207  122.896207 -636.577454    1.345321
## [1]  273.275868   60.803516 -636.348322    1.264913
## [1]  347.745361   29.441973 -636.251661    1.232802
## [1]  383.793297   14.176274 -636.208468    1.218457
## [1]  401.431382    6.691432 -636.188159    1.211676
## [1]  410.14523    2.99038 -636.17832    1.20838
## [1]  414.474912    1.150673 -636.173483    1.206755
## [1]  416.632832    0.233580 -636.171083    1.205948
## [1]  417.171440    0.000000 -636.170473    1.205774
## [1]  488.364830    0.000000 -636.170473    1.029998
## [1]  502.587940    0.000000 -636.170473    1.000849
## [1]  503.014237    0.000000 -636.170473    1.000001
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  190.736956  190.736956 -669.908268    1.343988
## [1]  593.982205   66.385958 -669.797092    1.071266
## [1]  683.872814   28.574067 -669.779845    1.053801
## [1]  722.47785   12.12013 -669.77339    1.04713
## [1]  740.311629    4.480398 -669.770599    1.044195
## [1]  748.8795236    0.8017044 -669.7693015    1.0428158
## [1]  749.9292910    0.3504937 -669.7691444    1.0426486
## [1]  750.4528689    0.1254204 -669.7690662    1.0425653
## [1]  750.714332    0.000000 -669.769023    1.042561
## [1]  781.361120    0.000000 -669.769023    1.001669
## [1]  782.663304    0.000000 -669.769023    1.000003
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  155.733467  155.733467 -654.507323    1.344129
## [1]  355.166860   72.722910 -654.282169    1.263951
## [1]  462.90728   26.43672 -654.17315    1.23197
## [1]  517.572634    2.699657 -654.120544    1.217684
## [1]  520.996667    1.206249 -654.117301    1.216834
## [1]  522.7087181    0.4593385 -654.1156818    1.2164108
## [1]  523.5647486    0.0858339 -654.1148729    1.2161995
## [1]  523.67175257    0.03914279 -654.11477180    1.21617309
## [1]  523.725255    0.000000 -654.114687    1.216235
## [1]  616.838629    0.000000 -654.114687    1.032641
## [1]  636.3366    0.0000 -654.1147    1.0010
## [1]  636.972367    0.000000 -654.114687    1.000001
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  157.322856  157.322856 -653.464436    1.312417
## [1]  506.167423   13.312896 -652.972372    1.175852
## [1]  527.184477    4.255995 -652.950370    1.170391
## [1]  532.384927    2.012384 -652.945035    1.169062
## [1]  534.9782567    0.8932444 -652.9423906    1.1684022
## [1]  536.2731859    0.3343478 -652.9410742    1.1680734
## [1]  536.92021510    0.05506854 -652.94041747    1.16790933
## [1]  537.001066    0.000000 -652.940288    1.167979
## [1]  614.232527    0.000000 -652.940288    1.021121
## [1]  626.937420    0.000000 -652.940288    1.000428
## [1]  627.2056    0.0000 -652.9403    1.0000
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  136.049954  136.049954 -644.498831    1.348752
## [1]  140.972187  185.950193 -644.497171    1.071668
## [1]  122.330293  212.050481 -644.496435    1.004505
## [1]  110.592441  218.985607 -644.496320    1.000022
## [1]  106.8428  220.8184 -644.4963    1.0000
## [1]  105.7302  221.3605 -644.4963    1.0000
## [1]  105.4003  221.5212 -644.4963    1.0000
## [1]  105.3025  221.5689 -644.4963    1.0000
## [1]  105.2735  221.5830 -644.4963    1.0000
## Using 6 CPU cores
## Computing Variance Component Estimates...
## Sigma^2_A     log-lik     RSS
## [1]  142.960225  142.960225 -648.652339    1.355658
## [1]    9.028350  221.545197 -648.476636    1.272749
## [1]    4.438819  224.145937 -648.471299    1.270623
## [1]    2.141191  225.446944 -648.468641    1.269567
## [1]    0.9916964  226.0975906 -648.4673143    1.2690404
## [1]    0.4167828  226.4229477 -648.4666517    1.2687777
## [1]    0.1292849  226.5856344 -648.4663206    1.2686465
## [1]    0.0574054  226.6263071 -648.4662378    1.2686137
## [1]    0.000000  226.646644 -648.466172    1.268655
## [1]    0.000000  274.642226 -648.466172    1.046949
## [1]    0.000000  286.958271 -648.466172    1.002015
## [1]    0.000000  287.535339 -648.466172    1.000004
## Using 6 CPU cores
head(results$Metabolite5)
##   variant.id chr     pos n.obs      freq MAC        Score  Score.SE
## 1          1   1 1120590   156 0.9102564  28 -0.041612740 0.3013262
## 2          2   1 1500664   155 0.5612903 136 -0.592421817 0.4601268
## 3          3   1 1748914   156 0.7403846  81 -0.622250487 0.4715085
## 4          4   1 1802548   156 0.6858974  98  0.003261458 0.4753489
## 5          5   1 1813782   156 0.8653846  42  0.058626020 0.3435269
## 6          6   1 2017761   156 0.6987179  94 -0.044858793 0.4453027
##     Score.Stat Score.pval        Est   Est.SE          PVE  metabolite
## 1 -0.138098653  0.8901625 -0.4583029 3.318663 1.101861e-04 Metabolite5
## 2 -1.287518486  0.1979136 -2.7981816 2.173314 9.577558e-03 Metabolite5
## 3 -1.319701651  0.1869346 -2.7988929 2.120853 1.006235e-02 Metabolite5
## 4  0.006861187  0.9945256  0.0144340 2.103718 2.719859e-07 Metabolite5
## 5  0.170659168  0.8644918  0.4967854 2.910980 1.682702e-04 Metabolite5
## 6 -0.100737756  0.9197586 -0.2262231 2.245664 5.863169e-05 Metabolite5
dim(results$Metabolite5)
## [1] 67735    14
close(gds)
all_results <- bind_rows(results)  
significant_results <- filter(all_results, Score.pval < 0.0001)
dim(significant_results)
## [1] 206  14
write.xlsx(significant_results, file = "Dataset/Significant_results.xlsx", rowNames = FALSE)
write.xlsx(all_results, file = "Dataset/All_results.xlsx", rowNames = FALSE)
heritability_results <- lapply(names(null_models), function(met) {
  model <- null_models[[met]]
  h2_ci <- varCompCI(model, prop = TRUE)
  print(h2_ci)
})
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##              Proportion  Lower 95 Upper 95
## V_A         0.000126439 -5.033504 5.033756
## V_resid.var 0.999873561 -4.033756 6.033504
##              Proportion  Lower 95 Upper 95
## V_A         0.000126439 -5.033504 5.033756
## V_resid.var 0.999873561 -4.033756 6.033504
##             Proportion  Lower 95 Upper 95
## V_A          0.8506954 -1.210683 2.912073
## V_resid.var  0.1493046 -1.912073 2.210683
##             Proportion  Lower 95 Upper 95
## V_A          0.8506954 -1.210683 2.912073
## V_resid.var  0.1493046 -1.912073 2.210683
##               Proportion  Lower 95 Upper 95
## V_A         0.0001450346 -5.631103 5.631393
## V_resid.var 0.9998549654 -4.631393 6.631103
##               Proportion  Lower 95 Upper 95
## V_A         0.0001450346 -5.631103 5.631393
## V_resid.var 0.9998549654 -4.631393 6.631103
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##               Proportion  Lower 95 Upper 95
## V_A         0.0001323867 -4.660251 4.660515
## V_resid.var 0.9998676133 -3.660515 5.660251
##               Proportion  Lower 95 Upper 95
## V_A         0.0001323867 -4.660251 4.660515
## V_resid.var 0.9998676133 -3.660515 5.660251
##             Proportion  Lower 95 Upper 95
## V_A          0.1321338 -4.621856 4.886124
## V_resid.var  0.8678662 -3.886124 5.621856
##             Proportion  Lower 95 Upper 95
## V_A          0.1321338 -4.621856 4.886124
## V_resid.var  0.8678662 -3.886124 5.621856
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##               Proportion  Lower 95 Upper 95
## V_A         0.0001440168 -6.646524 6.646812
## V_resid.var 0.9998559832 -5.646812 7.646524
##               Proportion  Lower 95 Upper 95
## V_A         0.0001440168 -6.646524 6.646812
## V_resid.var 0.9998559832 -5.646812 7.646524
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion Lower 95 Upper 95
## V_A                  1        1        1
## V_resid.var          0       NA       NA
##             Proportion  Lower 95 Upper 95
## V_A          0.3220787 -4.319544 4.963702
## V_resid.var  0.6779213 -3.963702 5.319544
##             Proportion  Lower 95 Upper 95
## V_A          0.3220787 -4.319544 4.963702
## V_resid.var  0.6779213 -3.963702 5.319544
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
##             Proportion Lower 95 Upper 95
## V_A                  0       NA       NA
## V_resid.var          1        1        1
herit_df <- do.call(rbind, heritability_results)

Task 3: Inflation factor calculation

Inflation Factor Formula
Inflation Factor Formula
# Function to compute lambda_GC from a vector of p‑values
calc_lambda <- function(p) {
  # drop invalid p's
  p <- p[!is.na(p) & p > 0 & p < 1]
  # convert to chi^2 (df=1) and take the median
  chisq_med <- median(qchisq(p, df = 1, lower.tail = FALSE))
  # theoretical median of X^2 is qchisq(0.5,1)
  lambda_gc  <- chisq_med / qchisq(0.5, 1)
  return(lambda_gc)
}

# Loop over your 'results' list
inflation_results <- lapply(names(results), function(met) {
  df <- results[[met]]
  lambda <- tryCatch(calc_lambda(df$Score.pval),
                     error = function(e) NA_real_)
  data.frame(
    metabolite = met,
    lambda_gc  = lambda,
    stringsAsFactors = FALSE
  )
})

inflation_df <- do.call(rbind, inflation_results)

# Compute the average lambda_GC across all metabolites
avg_lambda <- mean(inflation_df$lambda_gc, na.rm = TRUE)

# Add the average as a final summary row:
inflation_df <- rbind(
  inflation_df,
  data.frame(metabolite = "AVERAGE", lambda_gc = avg_lambda, stringsAsFactors = FALSE)
)

# Inspect
print(inflation_df)
##      metabolite lambda_gc
## 1   Metabolite1 0.9998043
## 2   Metabolite2 1.0089874
## 3   Metabolite3 1.0055276
## 4   Metabolite4 1.0045638
## 5   Metabolite5 1.1350615
## 6   Metabolite6 0.9907329
## 7   Metabolite7 1.3154792
## 8   Metabolite8 1.0069210
## 9   Metabolite9 1.2877979
## 10 Metabolite10 1.0444745
## 11 Metabolite11 1.0472325
## 12 Metabolite12 0.9958541
## 13 Metabolite13 0.9915045
## 14 Metabolite14 1.2936886
## 15 Metabolite15 0.9743886
## 16 Metabolite16 1.0120871
## 17 Metabolite17 0.9850725
## 18 Metabolite18 1.0178531
## 19 Metabolite19 1.0230257
## 20 Metabolite20 0.9842626
## 21      AVERAGE 1.0562160

Task 4: Create Manhattan Plots

# 1. Load the results
df <- all_results

# 2. Rename & clean columns
df$SNP        <- df$variant.id
df$BP         <- as.numeric(df$pos)
df$P          <- as.numeric(df$Score.pval)
df$Metabolite <- as.factor(df$metabolite)
df$chr[df$chr == "X"] <- "23"
df$CHR <- as.numeric(df$chr)

# 3. Output directory
out_dir <- "plots"
if (!dir.exists(out_dir)) dir.create(out_dir)

# 4. Thresholds
threshold_gw   <- 1e-4
threshold_sugg <- 1e-5

# 5. Per-metabolite ggplot2 Manhattan plots
mets <- levels(df$Metabolite)

for (met in mets) {
  subdf <- subset(df, Metabolite == met)
  subdf <- subdf[complete.cases(subdf[, c("CHR","BP","P")]), ]
  subdf <- subset(subdf, CHR %in% 1:23 & P > 0 & P <= 1)
  
  if (nrow(subdf) < 10) {
    warning(sprintf("Skipping %s: only %d valid SNPs", met, nrow(subdf)))
    next
  }
  
  # Add -log10(P)
  subdf$negLogP <- -log10(subdf$P)
  
  #png(
 #   filename = file.path(out_dir, paste0("Manhattan_", met, ".png")),
  #  width    = 12,
  #  height   = 6,
  #  units    = "in",
  #  res      = 300
 # )
  
  p <- manhattan(subdf,
            main = paste("Manhattan Plot —", met),
            col = c("skyblue", "orange"),
            suggestiveline = FALSE,
            genomewideline = -log10(1e-4))
  
  #dev.off()
}

df_unique <- df

df_unique$P <- -log10(df_unique$P)

# Save plot
#png("plots/Manhattan_plot_all_metabolites.png", width = 12, height = 6,units = "in", res = 300)

p_all <- manhattan(df_unique,
            main = "Combined Manhattan Plot (All Metabolites)",
            col = c("skyblue", "orange"),
            suggestiveline = FALSE,
            genomewideline = -log10(1e-4))

#dev.off()

Task 5: Metabolite Network

library(dplyr)
library(GeneNet)
library(RCy3)

# Create a matrix to hold the corrected values
corrected_matrix <- matrix(NA, nrow = nrow(metabolites), ncol = length(meta.names))
rownames(corrected_matrix) <- rownames(metabolites)
colnames(corrected_matrix) <- meta.names

# Fill matrix with residuals from each null model
for (met in meta.names) {
 model <- null_models[[met]]
 if (!is.null(model)) {
  fittedData <- model$fit$fitted.values
  corrected_matrix[, met] <- fittedData
 }
}

# Estimate partial correlations
pcor <- ggm.estimate.pcor(corrected_matrix)

# Test for significant associations
network_results <- network.test.edges(pcor)

# Filter significant pairs (pval < 0.05)
sig_pairs <- network_results %>% 
 filter(pval < 0.05) %>%
 arrange(desc(abs(pcor)))

# Save results
write.csv(sig_pairs, "Dataset/significant_pairs.csv", row.names = FALSE)

cytoscapePing()

# Prepare edges and nodes
edges <- data.frame(
 source = as.character(sig_pairs$node1),
 target = as.character(sig_pairs$node2),
 weight = sig_pairs$pcor,
 interaction = "interacts_with",
 pval = sig_pairs$pval,
 qval = sig_pairs$qval,
 stringsAsFactors = FALSE
)

nodes <- data.frame(
 id = unique(c(edges$source, edges$target)),
 name = unique(c(edges$source, edges$target)),
 stringsAsFactors = FALSE
)

# Create network in Cytoscape
createNetworkFromDataFrames(
 nodes = nodes,
 edges = edges,
 title = "Metabolic Network",
 collection = "Metabolite Correlations"
)

# Apply layout and visual styles
layoutNetwork("force-directed")

setNodeShapeDefault("ELLIPSE")
setNodeSizeDefault(30)
setNodeLabelMapping("name")

setEdgeLineWidthMapping(
 table.column = "weight",
 table.column.values = seq(min(abs(edges$weight)), max(abs(edges$weight)), length.out = 5),
 widths = seq(1, 3, length.out = 5),
 mapping.type = "continuous"
)

setEdgeColorMapping(
 table.column = "weight",
 table.column.values = c(min(edges$weight), 0, max(edges$weight)),
 colors = c("red", "blue", "yellow"),
 mapping.type = "continuous"
)


analyzeNetwork()
node.table <- getTableColumns(table = "node")
write.csv(node.table, "metabolite_network_node_metrics.csv", row.names = FALSE)
saveSession("Metabolite_Network_Analysis.cys")
print(head(node.table[, c("name", "Degree", "ClosenessCentrality", "BetweennessCentrality")]))
Metabolic Network
Metabolic Network

Task 6: Annotate Significant SNPs

Read the map file and extract the top 20 SNPs

map_data <- read.table("Dataset/input.map", header = FALSE, stringsAsFactors = FALSE)

# Assign column names
colnames(map_data) <- c("chr", "snp_id", "pos_cM", "pos")
map_data <- subset(map_data, select= -c(pos_cM))
# Read the Excel file
df <- read_excel("Dataset/Significant_results.xlsx", sheet = 1)

# Select top 20 unique SNPs based on lowest p-value
top_snps <- df[order(df$`Score.pval`), ]
top_snps <- top_snps[!duplicated(top_snps$`variant.id`), ]
top_snps <- subset(top_snps, select = c("chr", "pos"))
top_snps <- head(top_snps, 20)
top_snps$chr[top_snps$chr == "X"] <- 23
top_snps$chr <- as.integer(top_snps$chr)

top_20snps <- inner_join(top_snps, map_data, by = c("chr", "pos"))
# Save to a plain text file
writeLines(top_20snps$snp_id, "Dataset/snps_list.txt")

Extract the top 20 SNPs from .ped and .map files into new files using Plink.

plink --file input --extract snps_list.txt --make-bed --out top20_SNPs
Extraction of top 20 SNPs
Extraction of top 20 SNPs

Convert the top 20 SNPs to VCF format using PLINK.

Create VCF file
Create VCF file
plink --bfile top20_SNPs --recode vcf --out top20_SNPs_vcf

Download ANNOVAR and make sure you have Perl on your machine and you download the refGene annotation database for the human genome version hg19

sudo apt update
sudo apt install perl
perl -v
Download the required Database
Download the required Database
./annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene humandb/

Annotate the top 20 SNPs using ANNOVAR

  • top20_SNPs_vcf.vcf: Input VCF file containing SNPs to annotate.

  • humandb/: Directory containing ANNOVAR’s annotation databases (e.g., refGene).

  • buildver hg19: Specifies the human genome version (hg19).

  • out output/annotated_output: Output prefix and directory for results.

  • remove: Delete intermediate files after annotation.

  • protocol refGene: Use the refGene database for annotation (gene-based).

  • operation g: The operation type is gene-based (g).

  • nastring .: Missing values in the output will be represented as ..

  • vcfinput: Indicates the input file is in VCF format.

  • polish: Refines annotations (e.g., handles multi-allelic variants better).

  • thread 4: Use 4 threads to speed up processing.

Annotate the top 20 significant SNPs
Annotate the top 20 significant SNPs
./table_annovar.pl top20_SNPs_vcf.vcf humandb/   -buildver hg19   -out output/annotated_output   -remove   -protocol refGene   -operation g   -nastring .   -vcfinput   -polish   -thread 4
data <- read.table("Annovar/annotated_output.hg19_multianno.txt", header = TRUE, sep = "", stringsAsFactors = FALSE)
View(data)

Task 7: Regional plots using SNIPA

Get the top 5 most significant unique associated SNPs

df <- read_excel("Dataset/Significant_results.xlsx", sheet = 1)

top_snps <- df[order(df$`Score.pval`), ]
top_snps <- top_snps[!duplicated(top_snps$`variant.id`), ]
top_snps <- subset(top_snps, select = c("chr", "pos"))
top_snps <- head(top_snps, 5)
top_snps$chr[top_snps$chr == "X"] <- 23
top_snps$chr <- as.integer(top_snps$chr)

top_5snps <- inner_join(top_snps, map_data, by = c("chr", "pos"))
# Save to a plain text file
writeLines(top_5snps$snp_id, "Dataset/top5_SNPs.txt")

The proxy SNPs in LD with the top 5 most significant unique associated SNPs.

rs470778_proxy
rs470778_proxy
rs10995706_proxy
rs10995706_proxy
rs625127_proxy
rs625127_proxy
rs10805888_proxy
rs10805888_proxy
rs10940680_proxy1
rs10940680_proxy1
rs10940680_proxy2
rs10940680_proxy2
rs10940680_proxy3
rs10940680_proxy3

Some proxy SNPs annotations for each significant SNP

rs470778 annotation example
rs470778 annotation example
rs10995706 annotation example 1
rs10995706 annotation example 1
rs10995706 annotation example 2
rs10995706 annotation example 2
rs625127 annotation example
rs625127 annotation example
rs10805888 annotation example
rs10805888 annotation example
rs10940680 annotation example
rs10940680 annotation example

The regional plots for the closest 5 SNPs in the proxy.

rs470778 region
rs470778 region
rs10995706 region
rs10995706 region
rs625127 region
rs625127 region
rs10805888 region
rs10805888 region
rs10940680 region
rs10940680 region